import scrapy
from jk_scrapy.items import JkScrapyItem
import time
import os
from urllib import parse

# Browser-like default request headers.
# NOTE(review): `headers` is not referenced anywhere in this file — presumably
# intended to be passed to scrapy.Request or set as DEFAULT_REQUEST_HEADERS;
# TODO confirm against the project settings / other modules.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Connection': 'keep-alive'
}


class JiketimeSpider(scrapy.Spider):
    """Recursively crawl a d.shikey.com directory listing and mirror files.

    Directory index pages are parsed for entry links; links whose last path
    segment ends with a known file extension are downloaded and saved under
    ``out_path``, mirroring the remote ``/jike/...`` path.
    """

    name = 'jiketime'
    base_url = 'https://d.shikey.com'
    # allowed_domains must contain bare domains, not URLs with paths —
    # Scrapy's offsite middleware ignores entries that look like URLs.
    # (The original value 'd.shikey.com/jike' was therefore ineffective.)
    allowed_domains = ['d.shikey.com']
    # Local root directory under which the remote tree is mirrored.
    out_path = "/Volumes/Ralap_ExFAT/Source/GeekTime/one_mananger"

    # Extensions treated as downloadable files (checked case-sensitively).
    fileSuffixs = [".zip", ".mp3", ".mp4", ".html", ".pdf", ".doc", ".docx",
                   ".jpg", ".jpeg", ".PNG", ".png", ".url", ".rar", ".txt"]

    # NOTE: the original class assigned start_urls twice; only the second
    # assignment ever took effect, so the dead first one was removed.
    start_urls = [
        'https://d.shikey.com/jike/%E5%B7%B2%E5%AE%8C%E7%BB%93%E7%9A%84%E8%AF%BE%E7%A8%8B/37%E7%8E%A9%E8%BD%ACgit%E4%B8%89%E5%89%91%E5%AE%A2/'
    ]

    def parse(self, response):
        """Save file responses to disk; follow links on directory pages.

        Yields further ``scrapy.Request`` objects for each directory entry.
        """
        req_url = response.url
        # On redirect, recover the original (pre-redirect) URL so the local
        # path mirrors the link that was actually followed.
        if 'redirect_urls' in response.request.meta:
            req_url = response.request.meta['redirect_urls'][0]

        req_url = parse.unquote(req_url)
        print("URL: " + req_url)

        # Preview links duplicate the real content; skip them outright.
        if "?preview" in req_url:
            return

        if self.is_file(req_url):
            # NOTE(review): the item is populated but never yielded, so it
            # never reaches the pipeline — presumably intentional since the
            # file is written directly below; TODO confirm.
            item = JkScrapyItem()
            item["url"] = req_url
            # Mirror the remote path starting at "/jike" under out_path.
            local_path = self.out_path + req_url[req_url.rfind("/jike"):]
            print("文件下载: " + req_url)
            print("    ==> " + local_path)

            parent_path = local_path[:local_path.rfind('/')]
            # exist_ok avoids the check-then-create race of the original
            # os.path.exists() + makedirs() pair.
            os.makedirs(parent_path, exist_ok=True)
            # Context manager guarantees the handle is closed (the original
            # leaked it via open(...).write(...)).
            with open(local_path, "wb") as fh:
                fh.write(response.body)
            return

        # Directory listing: follow every entry link in the index table.
        class_hrefs = response.xpath('//tr[starts-with(@id,"tr")]/td[1]/a/@href')
        for class_href in class_hrefs:
            path = class_href.extract().replace('/ /', '/')
            url = self.base_url + path
            if "?preview" in url:
                continue
            print('-----【解析】==> : ' + url)
            # NOTE(review): time.sleep blocks Twisted's reactor; the
            # DOWNLOAD_DELAY setting is the idiomatic throttle. Kept as-is
            # to preserve the existing crawl pacing.
            time.sleep(2)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def close(self, spider):
        """Log spider shutdown."""
        print("爬虫结束")

    def is_file(self, url):
        """Return True if the URL's last path segment ends with a known suffix.

        Uses endswith with a tuple instead of the original substring test,
        which produced false positives (e.g. "notes.zip.backup" matched ".zip").
        """
        last_segment = url[url.rfind('/') + 1:]
        return last_segment.endswith(tuple(self.fileSuffixs))
